This project aims to use data from a WhatsApp group to visualize and gain some insight from it.
!pip install emoji
import re
import regex
import pandas as pd
import numpy as np
import emoji
import plotly.express as px
from collections import Counter
import matplotlib.pyplot as plt
from os import path
from PIL import Image
import datetime
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
%matplotlib inline
DATA PREPARATION:
We want our data to be organised into the following columns:
19/02/17,4:44AM - Pam: Where is everyone?
{Date}, {Time} - {Author}:{Message}
{19/02/17}, {4:44AM} - {Pam}:{Where is everyone?}
The function below helps us achieve this.
def startsWithDateAndTime(s):
    """Return True if *s* begins with a WhatsApp 'date, time -' stamp.

    Matches both 12-hour exports (optional AM/PM, with or without a space)
    and 24-hour exports, e.g. '7/26/18, 22:51 -' or '19/02/17, 4:44AM -'.
    """
    # FIX: raw string — the original plain string relied on '\/' and '\s'-style
    # escapes, which raise invalid-escape warnings on modern Python.
    pattern = r'^([0-9]+)(\/)([0-9]+)(\/)([0-9]+), ([0-9]+):([0-9]+)[ ]?(AM|PM|am|pm)? -'
    return re.match(pattern, s) is not None
# Quick sanity check: a 24-hour-format export line should match the stamp pattern.
startsWithDateAndTime('7/26/18, 22:51 - Bobby: This message was deleted')
def FindAuthor(s):
    """Return True if *s* carries an 'Author: message' prefix.

    System notices (group created, subject changed, ...) have no such
    prefix and return False.
    """
    # FIX: split at most once. The original required EXACTLY two parts, so a
    # message whose body itself contained ':' (links, times, ...) was wrongly
    # treated as author-less. Also reject an empty author name.
    parts = s.split(':', 1)
    return len(parts) == 2 and bool(parts[0])
def getDataPoint(line):
    """Split one timestamped export line into (date, time, author, message).

    Expected shape: '{Date}, {Time} - {Author}: {Message}'. Lines without an
    'Author: ' prefix (system notices) come back with author=None.
    """
    # FIX: split once and keep the remainder intact. The original split on
    # every ' - ' / ': ' and re-joined with a single space, corrupting any
    # message that itself contained those separators.
    dateTime, _, message = line.partition(' - ')
    date, time = dateTime.split(', ')
    author, sep, body = message.partition(': ')
    if sep:
        return date, time, author, body
    # No 'Author: ' prefix — presumably a system notice; author unknown.
    return date, time, None, message
parsedData = []  # Accumulates [date, time, author, message] rows for the dataframe
conversationPath = 'C:/Users/USER/Desktop/Semicolon/MachineLearning/WhatsApp Chat with #ChopLifeGang.txt'
with open(conversationPath, encoding="utf-8") as fp:
    fp.readline()  # Skipping first line of the file because it contains information about end-to-end encryption
    messageBuffer = []  # Collects continuation lines of a multi-line message
    date, time, author = None, None, None
    while True:
        line = fp.readline()
        if not line:
            break
        line = line.strip()
        if startsWithDateAndTime(line):
            # A new timestamped message starts: flush the buffered one first.
            if len(messageBuffer) > 0:
                parsedData.append([date, time, author, ' '.join(messageBuffer)])
                messageBuffer.clear()
            date, time, author, message = getDataPoint(line)
            messageBuffer.append(message)
        else:
            # Continuation line of the previous (multi-line) message.
            messageBuffer.append(line)
    # BUG FIX: the original loop silently dropped the very last message of the
    # chat, because the buffer was only flushed when a newer message arrived.
    if messageBuffer and date is not None:
        parsedData.append([date, time, author, ' '.join(messageBuffer)])
df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message'])  # Initialising a Pandas dataframe
# NOTE(review): the sample line '7/26/18, ...' looks month-first; confirm the
# export locale before relying on pandas' default month-first parsing here.
df["Date"] = pd.to_datetime(df["Date"])
df.head(2)
df.info()
df.Author.unique()
`None` is the author recorded for rows that have no sender — for example the system messages noting that the group was created and that Caspeezie was added. Let us remove the messages tagged under `None`.
# Drop the author-less rows (system notices were parsed with Author = NaN).
df = df.dropna(axis=0, how='any')
df.info()
df.Author.unique()
We have successfully removed the None author!
# Total number of messages = number of rows left in the dataframe.
total_messages = len(df)
print(total_messages)
Let us find out the total number of media messages
# WhatsApp replaces media content with the literal placeholder '<Media omitted>'.
media_messages = len(df[df['Message'] == '<Media omitted>'])
print(media_messages)
def split_count(text):
    """Return every emoji grapheme cluster appearing in *text*.

    The \\X regex matches full grapheme clusters, so multi-codepoint emoji
    (skin tones, flags, ZWJ sequences) are kept together as single items.
    """
    # FIX: emoji>=2.0 removed `UNICODE_EMOJI`; `EMOJI_DATA` is its replacement
    # lookup table. Fall back so this cell works on older package versions too.
    emoji_bank = getattr(emoji, 'EMOJI_DATA', None) or getattr(emoji, 'UNICODE_EMOJI', {})
    clusters = regex.findall(r'\X', text)
    return [cluster for cluster in clusters
            if any(char in emoji_bank for char in cluster)]
# Attach the list of emoji found in each message, then count them all.
df["emoji"] = df["Message"].apply(split_count)
emojis = df['emoji'].str.len().sum()
print(emojis)
# Count http/https links per message.
URLPATTERN = r'(https?://\S+)'
df['urlcount'] = df.Message.apply(lambda msg: len(re.findall(URLPATTERN, msg)))
links = df.urlcount.sum()
# Headline numbers for the whole group.
print("Group Wise Statistics")
print(f"Messages: {total_messages}")
print(f"Media: {media_messages}")
print(f"Emojis {emojis}")
print(f"Links: {links}")
Let us separate the media messages from the text messages.
# Partition the frame into media placeholders and real text messages.
is_media = df['Message'] == '<Media omitted>'
media_messages_df = df[is_media]
messages_df = df[~is_media]
messages_df.info()
Let's add two new columns to our dataframe that would detail the number of letters and words each author uses. We will name them "Letter_Count" and "Word_Count" respectively.
# Per-message character and word counts, plus a constant column that makes
# "number of messages" a simple sum after groupby.
messages_df['Letter_Count'] = messages_df['Message'].apply(lambda s : len(s))
messages_df['Word_Count'] = messages_df['Message'].apply(lambda s : len(s.split(' ')))
messages_df["MessageCount"] = 1
messages_df.head(2)
# FIX: count emoji from messages_df's own column — the original read the
# global `df` and relied on index alignment, which is fragile.
messages_df["emojicount"] = messages_df['emoji'].str.len()
# Per-author summary: message count, words per message, media, emoji, links.
l = messages_df.Author.unique()
for author in l:
    author_df = messages_df[messages_df['Author'] == author]
    print(f'Stats of {author} -')
    # Row count of the filtered frame = messages sent by this author.
    print('Messages Sent', author_df.shape[0])
    words_per_message = np.sum(author_df['Word_Count']) / author_df.shape[0]
    print('Words per message', words_per_message)
    # Media placeholders live in the separate media frame.
    media = media_messages_df[media_messages_df['Author'] == author].shape[0]
    print('Media Messages Sent', media)
    emojis = sum(author_df['emoji'].str.len())
    print('Emojis Sent', emojis)
    links = sum(author_df["urlcount"])
    print('Links Sent', links)
    print()
# Number of distinct emoji used across the whole group.
total_emojis_list = list({e for bucket in messages_df.emoji for e in bucket})
total_emojis = len(total_emojis_list)
print(total_emojis)
# Flat list of every emoji occurrence, ranked by frequency.
total_emojis_list = [e for bucket in messages_df.emoji for e in bucket]
emoji_dict = Counter(total_emojis_list).most_common()
print(emoji_dict)
emoji_df = pd.DataFrame(emoji_dict, columns=['emoji', 'count'])
emoji_df
import plotly.express as px  # re-import kept so this cell runs standalone
# Pie chart of overall emoji usage in the group.
fig = px.pie(emoji_df, names='emoji', values='count')
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.show()
# One emoji-distribution pie chart per author.
l = messages_df.Author.unique()
for member in l:
    dummy_df = messages_df[messages_df['Author'] == member]
    total_emojis_list = [e for bucket in dummy_df.emoji for e in bucket]
    emoji_dict = sorted(Counter(total_emojis_list).items(), key=lambda kv: kv[1], reverse=True)
    print('Emoji Distribution for', member)
    author_emoji_df = pd.DataFrame(emoji_dict, columns=['emoji', 'count'])
    fig = px.pie(author_emoji_df, names='emoji', values='count')
    fig.update_traces(textinfo='percent+label', textposition='inside')
    fig.show()
def f(i):
    """Map a pandas weekday index (Monday=0 .. Sunday=6) to its English name.

    Raises IndexError for i > 6; negative i wraps from the end, as before.
    """
    # Tuple literal: immutable and constant-folded, unlike the per-call list
    # the original rebuilt on every invocation (it also had a stray semicolon).
    days = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
    return days[i]
# Messages per weekday, shown as a radar (polar line) chart.
day_df = pd.DataFrame(messages_df["Message"])
day_df['day_of_date'] = messages_df['Date'].dt.weekday
day_df['day_of_date'] = day_df["day_of_date"].apply(f)
day_df["messagecount"] = 1
# FIX: aggregate only the numeric count column — summing the whole frame
# also "sums" (concatenates) every Message string per weekday.
day = day_df.groupby("day_of_date", as_index=False)["messagecount"].sum()
# NOTE(review): groupby sorts the day names alphabetically, so the polar axis
# is not in Monday..Sunday order — consider reindexing if that matters.
fig = px.line_polar(day, r='messagecount', theta='day_of_date', line_close=True)
fig.update_traces(fill='toself')
fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True
        ),
    ),
    showlegend=False
)
fig.show()
We can notice that the most messages have been posted on a Monday, probably followed by Thursday...
# Message volume over time.
# FIX: restrict the aggregation to MessageCount — the old whole-frame sum
# concatenated the Message/Author string columns and the emoji lists per date.
date_df = messages_df.groupby("Date", as_index=False)["MessageCount"].sum()
fig = px.line(date_df, x="Date", y="MessageCount")
fig.update_xaxes(nticks=20)
fig.show()
# Horizontal bar chart of message counts per author.
# FIX: sum only MessageCount instead of every column (string and list columns
# would be concatenated by a whole-frame sum).
auth = messages_df.groupby("Author", as_index=False)["MessageCount"].sum()
fig = px.bar(auth, y="Author", x="MessageCount", color='Author', orientation="h",
             color_discrete_sequence=["red", "green", "blue", "goldenrod", "magenta"],
             title="Explicit color sequence"
            )
fig.show()
Our top chatter happens to be Xzeeecool, with more than 1000 messages! Congratulations!
# Fifteen busiest times of day, as a horizontal bar chart.
top_times = messages_df['Time'].value_counts().head(15)
top_times.plot.barh()
plt.xlabel('Number of messages')
plt.ylabel('Time')
As noticed above, the group comes alive mostly at night, at around 9:00 pm.
# Twelve busiest dates, plus the full per-date counts for reference.
date_counts = messages_df['Date'].value_counts()
date_counts.head(12).plot.barh()
print(date_counts)
plt.xlabel('Number of Messages')
plt.ylabel('Date')
The most happening day coincidentally happened to be yesterday, 26th October, 2020. I guess the lectures @VynzeCent was giving really had everyone involved!!!
A word cloud is a visual representation of words in a particular text. The size of words in the word cloud is directly proportional to the frequency of that word in a text. We will create a word cloud for all the messages in the group.
# Build one big corpus string from every text message.
text = " ".join(review for review in messages_df.Message)
# FIX: len(text) is the CHARACTER count — split on whitespace to count words.
print("There are {} words in all the messages.".format(len(text.split())))
stopwords = set(STOPWORDS)
# Chat filler words (pidgin/short interjections) that would dominate the cloud.
stopwords.update(["ra", "ga", "na", "ani", "em", "ki", "ah","ha","la","eh","ne","le"])
#Generates a word cloud image
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)
#Display the generated image, the matplotlib way:
plt.figure(figsize=(10,5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
So far, "dey" and "go" are the most used words on the ChopLifeGang group chat!! It is why the words are boldly printed out
We can conclude that some interesting details could be derived from this little data analysis. Though this work is still a work in progress, feel free to make use of the code and try out your analysis too!!!